1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

Data from the speech features

1.2 The Data


pd_speech_features <- as.data.frame(read_excel("~/GitHub/FCA/Data/pd_speech_features.xlsx",sheet = "pd_speech_features", range = "A2:ACB758"))

1.2.1 The Average of the Three Repetitions

Each subject had three repeated observations. Here I’ll use the average of the three experiments per subject.

rep1Parkison <- subset(pd_speech_features,RID==1)
rownames(rep1Parkison) <- rep1Parkison$id
rep1Parkison$id <- NULL
rep1Parkison$RID <- NULL
rep1Parkison[,1:ncol(rep1Parkison)] <- sapply(rep1Parkison,as.numeric)

rep2Parkison <- subset(pd_speech_features,RID==2)
rownames(rep2Parkison) <- rep2Parkison$id
rep2Parkison$id <- NULL
rep2Parkison$RID <- NULL
rep2Parkison[,1:ncol(rep2Parkison)] <- sapply(rep2Parkison,as.numeric)

rep3Parkison <- subset(pd_speech_features,RID==3)
rownames(rep3Parkison) <- rep3Parkison$id
rep3Parkison$id <- NULL
rep3Parkison$RID <- NULL
rep3Parkison[,1:ncol(rep3Parkison)] <- sapply(rep3Parkison,as.numeric)

whof <- !(colnames(rep1Parkison) %in% c("gender","class"));
avgParkison <- rep1Parkison;
avgParkison[,whof] <- (rep1Parkison[,whof] + rep2Parkison[,whof] + rep3Parkison[,whof])/3


signedlog <- function(x) { return (sign(x)*log(abs(1.0e12*x)+1.0))}
whof <- !(colnames(avgParkison) %in% c("gender","class"));
avgParkison[,whof] <- signedlog(avgParkison[,whof])

1.2.1.1 Standarize the names for the reporting

studyName <- "Parkinsons"
dataframe <- avgParkison
outcome <- "class"

TopVariables <- 10

thro <- 0.80
cexheat = 0.15

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
252 753
pander::pander(table(dataframe[,outcome]))
0 1
64 188

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1000

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]


dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data


if (!largeSet)
{
  
  hm <- heatMaps(data=dataframe,
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9999953

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 681 , Uni p: 0.01697637 , Uncorrelated Base: 164 , Outcome-Driven Size: 0 , Base Size: 164 
#> 
#> 
 1 <R=1.000,w=  1,N=  358>, Top: 82( 42 )[ 1 : 82 Fa= 82 : 0.975 ]( 82 , 232 , 0 ),<|>Tot Used: 314 , Added: 232 , Zero Std: 0 , Max Cor: 1.000
#> 
 2 <R=1.000,w=  1,N=  358>, Top: 23( 7 )[ 1 : 23 Fa= 105 : 0.975 ]( 23 , 46 , 82 ),<|>Tot Used: 347 , Added: 46 , Zero Std: 0 , Max Cor: 0.996
#> 
 3 <R=0.996,w=  1,N=  358>, Top: 11( 4 )[ 1 : 11 Fa= 111 : 0.973 ]( 11 , 14 , 105 ),<|>Tot Used: 361 , Added: 14 , Zero Std: 0 , Max Cor: 0.973
#> 
 4 <R=0.973,w=  2,N=  218>, Top: 73( 3 )[ 1 : 73 Fa= 145 : 0.937 ]( 72 , 105 , 111 ),<|>Tot Used: 430 , Added: 105 , Zero Std: 0 , Max Cor: 0.981
#> 
 5 <R=0.981,w=  2,N=  218>, Top: 14( 2 )[ 1 : 14 Fa= 151 : 0.941 ]( 14 , 18 , 145 ),<|>Tot Used: 439 , Added: 18 , Zero Std: 0 , Max Cor: 0.955
#> 
 6 <R=0.955,w=  3,N=  167>, Top: 55( 3 )=[ 2 : 55 Fa= 168 : 0.902 ]( 55 , 83 , 151 ),<|>Tot Used: 475 , Added: 83 , Zero Std: 0 , Max Cor: 0.987
#> 
 7 <R=0.987,w=  3,N=  167>, Top: 9( 1 )[ 1 : 9 Fa= 172 : 0.894 ]( 9 , 9 , 168 ),<|>Tot Used: 479 , Added: 9 , Zero Std: 0 , Max Cor: 0.893
#> 
 8 <R=0.893,w=  4,N=  196>, Top: 63( 6 )=[ 2 : 63 Fa= 188 : 0.856 ]( 59 , 107 , 172 ),<|>Tot Used: 518 , Added: 107 , Zero Std: 0 , Max Cor: 0.984
#> 
 9 <R=0.984,w=  4,N=  196>, Top: 12( 1 )[ 1 : 12 Fa= 192 : 0.842 ]( 12 , 13 , 188 ),<|>Tot Used: 522 , Added: 13 , Zero Std: 0 , Max Cor: 0.853
#> 
 10 <R=0.853,w=  5,N=   20>, Top: 9( 1 )[ 1 : 9 Fa= 194 : 0.800 ]( 9 , 11 , 192 ),<|>Tot Used: 524 , Added: 11 , Zero Std: 0 , Max Cor: 0.869
#> 
 11 <R=0.869,w=  5,N=   20>, Top: 2( 1 )[ 1 : 2 Fa= 195 : 0.800 ]( 2 , 2 , 194 ),<|>Tot Used: 524 , Added: 2 , Zero Std: 0 , Max Cor: 0.797
#> 
 12 <R=0.797,w=  6,N=    0>
#> 
 [ 12 ], 0.7971068 Decor Dimension: 524 . Cor to Base: 256 , ABase: 27 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

718

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

304

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

4.83

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

3.61

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe,
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.7971068

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data

classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])

1.8.2 The decorralted UMAP


datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : std_MFCC_2nd_coef 200 : app_entropy_log_3_coef 300 : app_LT_TKEO_mean_7_coef 400 : tqwt_entropy_log_dec_15 500 : tqwt_medianValue_dec_7
600 : tqwt_stdValue_dec_35 700 : tqwt_skewnessValue_dec_27




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : La_std_MFCC_2nd_coef 200 : La_app_entropy_log_3_coef 300 : La_app_LT_TKEO_mean_7_coef 400 : La_tqwt_entropy_log_dec_15 500 : tqwt_medianValue_dec_7
600 : tqwt_stdValue_dec_35 700 : tqwt_skewnessValue_dec_27

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
std_delta_delta_log_energy 0.251 0.825 -0.736 0.804 0.483 0.798
std_delta_log_energy 0.250 0.841 -0.690 0.787 0.469 0.794
std_9th_delta_delta 0.347 0.952 -0.611 0.674 0.766 0.787
std_8th_delta_delta 0.319 0.941 -0.598 0.595 0.862 0.780
std_7th_delta_delta 0.324 0.905 -0.558 0.647 0.977 0.776
tqwt_entropy_log_dec_12 -0.147 0.876 0.764 0.876 0.676 0.770
std_6th_delta_delta 0.311 0.851 -0.470 0.540 0.896 0.768
std_8th_delta 0.310 0.950 -0.587 0.637 0.971 0.767
std_9th_delta 0.306 0.885 -0.519 0.660 0.330 0.764
tqwt_entropy_shannon_dec_12 -0.282 0.940 0.593 0.833 0.145 0.763


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
std_delta_delta_log_energy 0.2506 0.825 -0.736 0.804 4.83e-01 0.798
std_8th_delta_delta 0.3190 0.941 -0.598 0.595 8.62e-01 0.780
tqwt_entropy_shannon_dec_12 -0.2819 0.940 0.593 0.833 1.45e-01 0.763
La_tqwt_entropy_shannon_dec_33 -0.0534 0.230 0.197 0.357 3.85e-01 0.759
mean_MFCC_2nd_coef -0.3598 1.433 -1.933 1.997 2.87e-06 0.753
La_tqwt_entropy_shannon_dec_20 -0.1988 0.533 0.093 0.263 4.87e-01 0.744
La_tqwt_entropy_shannon_dec_17 -0.1131 0.508 0.135 0.175 6.94e-01 0.734
La_tqwt_stdValue_dec_32 0.0206 0.168 -0.156 0.273 2.33e-01 0.734
La_tqwt_kurtosisValue_dec_33 0.0505 0.377 -0.301 0.511 1.54e-01 0.732
tqwt_kurtosisValue_dec_20 0.2743 0.943 -0.454 0.746 5.50e-01 0.727

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2.48 476 0.639


allSigvars <- names(dc)



dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
std_delta_delta_log_energy 0.2506 0.825 -0.73605 0.804 4.83e-01 0.798 0.798 2
std_8th_delta_delta 0.3190 0.941 -0.59810 0.595 8.62e-01 0.780 0.780 6
tqwt_entropy_shannon_dec_12 -0.2819 0.940 0.59251 0.833 1.45e-01 0.763 0.763 10
La_tqwt_entropy_shannon_dec_33 -0.868tqwt_entropy_shannon_dec_31 + 1.000tqwt_entropy_shannon_dec_33 -0.0534 0.230 0.19681 0.357 3.85e-01 0.759 0.621 0
mean_MFCC_2nd_coef -0.3598 1.433 -1.93344 1.997 2.87e-06 0.753 0.753 NA
La_tqwt_entropy_shannon_dec_20 + 1.000tqwt_entropy_shannon_dec_20 + 0.886tqwt_minValue_dec_20 -0.1988 0.533 0.09299 0.263 4.87e-01 0.744 0.622 -1
La_tqwt_entropy_shannon_dec_17 + 1.000tqwt_entropy_shannon_dec_17 + 0.990tqwt_minValue_dec_17 -0.1131 0.508 0.13493 0.175 6.94e-01 0.734 0.709 -1
La_tqwt_stdValue_dec_32 + 1.000tqwt_stdValue_dec_32 -1.002tqwt_stdValue_dec_33 0.0206 0.168 -0.15556 0.273 2.33e-01 0.734 0.573 4
La_tqwt_kurtosisValue_dec_33 -0.809tqwt_kurtosisValue_dec_31 + 1.000tqwt_kurtosisValue_dec_33 0.0505 0.377 -0.30099 0.511 1.54e-01 0.732 0.628 -1
tqwt_kurtosisValue_dec_20 0.2743 0.943 -0.45445 0.746 5.50e-01 0.727 0.727 1
tqwt_entropy_shannon_dec_17 NA -0.5108 1.187 0.25698 0.652 7.06e-03 0.709 0.709 NA
tqwt_minValue_dec_17 NA 0.4016 1.110 -0.12323 0.652 2.66e-02 0.636 0.636 15
tqwt_kurtosisValue_dec_33 NA 0.2156 0.824 -0.11644 0.756 5.93e-02 0.628 0.628 NA
tqwt_stdValue_dec_33 NA -0.0105 0.884 0.43941 0.903 4.60e-01 0.628 0.628 NA
tqwt_entropy_shannon_dec_20 NA -0.4085 1.117 0.08982 0.678 2.47e-01 0.622 0.622 NA
tqwt_entropy_shannon_dec_33 NA -0.0183 0.868 0.32633 0.970 4.68e-01 0.621 0.621 NA
tqwt_stdValue_dec_32 NA 0.0102 0.935 0.28472 0.911 2.65e-01 0.573 0.573 NA
tqwt_minValue_dec_20 NA 0.2365 1.116 0.00358 0.744 4.02e-01 0.544 0.544 9
tqwt_entropy_shannon_dec_31 NA 0.0405 0.977 0.14928 1.028 3.93e-01 0.537 0.537 NA
tqwt_kurtosisValue_dec_31 NA 0.2041 0.837 0.22819 0.904 1.27e-01 0.490 0.490 3